#####################################################################################################
###### Replication file for "Income Measures in Cross-National Surveys: Problems and Solutions",
######      PSRM, conditionally accepted as of June 3, 2016
###### Authors: Michael J. Donnelly and Grigore Pop-Eleches
###### For questions about the code, please email Michael at michaeljamesdonnelly@gmail.com
#####################################################################################################
  date()

  rm(list = ls())                                           ## Start from a blank slate


  ## The script requires each of the following packages. If any are not installed on your machine,
  ## run the following, which is currently commented out:
  ##  install.packages("reldist")
  ##  install.packages("plotrix")
  ##  install.packages("lattice")
  ##  install.packages("ggplot2")
  ##  install.packages("foreign")
  ##  install.packages("questionr")

  library(reldist)
  library(plotrix)
  library(lattice)
  library(ggplot2)
  library(foreign)
  library(questionr)
  ## Set this path to the relevant directory
  replpath <- "C:/users/michael/desktop/IncomeReplication"
  ## The necessary data are in the /ReplicationData folder or the replication package
  setwd(paste(replpath, "/ReplicationData", sep = ""))



#####################################################################################################
###### Figure 1
#####################################################################################################

  ## This is the simplified main WVS file
  full <- read.dta("WVSincomeReplicationData.dta")

  ## This contains macro data, including an estimate of houshold size
  mcro <- subset(read.dta("macrovars2.dta"), select = c("conames", "year", "hhsize"))
  names(mcro) <- c("conames", "year", "hsize")
  full <- merge(full, mcro, by = c("conames", "year"), all.x = TRUE)


 ## Figure 1 has four country-waves. This selects those country-waves as separate files.
  egy00 <- subset(full, conames == "Egypt" & year == 2000)
  fin90 <- subset(full, conames == "Finland" & year == 1990)
  bra91 <- subset(full, conames == "Brazil" & year == 1991)
  nor96 <- subset(full, conames == "Norway" & year == 1996)

 ## For commentary about percentile of individuals in the 5th category
 ## This takes a weighted table and produces a cumulative weighted sum, then uses the midpoint of the
 ## fifth cagtegory as the estimate of the percentile for repsondents in that group

    bratab <- wtd.table(x = bra91$simpleInc, weights = bra91$weight)    ## Weighted frequency table
    brapercs <- bratab/sum(bratab)        ## turned into percentiles
    sum(brapercs[1:4]) + (sum(brapercs[1:5]) - sum(brapercs[1:4]))/2    ## 0.886

    fintab <- wtd.table(x = fin90$simpleInc, weights = fin90$weight)
    finpercs <- fintab/sum(fintab)
    sum(finpercs[1:4]) + (sum(finpercs[1:5]) - sum(finpercs[1:4]))/2    ## 0.135


  one.wave.dats <- list(bra91, egy00, fin90, nor96)     ## puts those four into a list
  laymat <- matrix(1:4, nrow = 2, byrow = TRUE)         ## the layout matrix (2x2 square)

 ## Return to the main folder
  setwd(replpath)
  png("Figure1.png")    ## open graphics device in png format
  layout(laymat)
  for (i in 1:4){
    ## This loops over the four data sets and plots a simple weighted histogram
    thisdat <- subset(one.wave.dats[[i]], !is.na(simpleInc))
    thisTitle <- paste(thisdat$conames[1], thisdat$year[1], sep = "\n")

    ## holder is a weighted histogram object that is not plotted yet
    holder <- weighted.hist(thisdat$simpleInc,
            w = thisdat$weight,
            freq = FALSE,
            breaks = c(seq(1, 11, by = 1)),
            axisnames = FALSE,
            plot = FALSE)
  if (i == 1){par(mar = c(1, 4, 5, 0))}     ## Slightly different margins based on where in the
  if (i == 2){par(mar = c(1, 1, 5, 1))}     ## square we are
  if (i == 3){par(mar = c(3, 4, 2, 0))}
  if (i == 4){par(mar = c(3, 1, 2, 1))}

 ## Now to plot them
  barplot(holder$density,
            axes = FALSE,
            space = 0,
            ylim = c(0, .3),
            col = "gray90",
            cex.lab = 1.5,
            main = thisTitle,
            xlim = c(0, 10))
  if (i == 1){ ## These if statements determine whether to add axes
    axis(side = 2,
        at = seq(0, .3, by = .1),
        labels = c("0", "10", "20", "30%"),
        las = 2)
    }
  if (i == 2){ ## No axes on the top-right panel
    }
  if (i == 3){
    axis(side = 2,
        at = seq(0, .3, by = .1),
        labels = c("0", "10", "20", "30%"),
        las = 2)
    axis(side = 1,
        at = seq(.5, 9.5, by = 1),
        labels = 1:10, cex.axis = .9)
  }
 if (i == 4){
    axis(side = 1,
        at = seq(.5, 9.5, by = 1),
        labels = 1:10, cex.axis = .9)
    }
 ##
  abline(h = .1, lty = "dashed")
 ##
  }
 dev.off()



#####################################################################################################
###### Figure 2
#####################################################################################################

  ## These creates a simplified macro data set based on our imputed data


  myginis <- by(full, list(full$abbr),
                function(x)gini(subset(x, !is.na(newinc))$newinc,
                weights = subset(x, !is.na(newinc))$weight)*100)
  gsoltginis <- by(full, list(full$abbr),
                function(x)mean(x$ggini, na.rm = TRUE))
  nsoltginis <- by(full, list(full$abbr),
                function(x)mean(x$ngini, na.rm = TRUE))

  pppGDPs <- by(full, list(full$abbr),
                function(x)weighted.mean(x$pppid*(100/x$cpiIndex),
                                        weights = x$weight,
                                        na.rm = TRUE))
  pppEST <- by(full, list(full$abbr),
                function(x)weighted.mean(x$pppinc/mean(x$hsize, na.rm = TRUE),
                                        weights = x$weight,
                                        na.rm = TRUE))

  GDPs <- by(full, list(full$abbr),
                function(x)weighted.mean(x$usd*(100/x$cpiIndex),
                                        weights = x$weight,
                                        na.rm = TRUE))
  EST <- by(full, list(full$abbr),
                function(x)weighted.mean(x$newinc/mean(x$hsize, na.rm = TRUE),
                                        weights = x$weight,
                                        na.rm = TRUE))
  ## This sticks all of those aggregated measures into a single data frame
  mydat <- data.frame(matrix(NA, nrow = length(EST), ncol = 7))
  names(mydat) <- c("abbr", "EST", "GDP", "pppEST", "pppGDP",
                    "mygini", "soltgini")


  mydat$abbr <- dimnames(EST)[[1]]
  mydat$pppGDP <- pppGDPs
  mydat$GDP <- GDPs
  mydat$pppEST <- pppEST
  mydat$EST <- EST
  mydat$mygini <- myginis
  mydat$mygini <- ifelse(mydat$mygini == 0, NA, mydat$mygini)   ## the gini function spits out 0 for
                                                                ## missing data
  mydat$gsoltgini <- gsoltginis
  mydat$nsoltgini <- nsoltginis
  axpts <- seq(0, 60000, by = 10000)
  axlabs <- c("0", "$10,000", "$20,000",
            "$30,000", "$40,000", "$50,000",
            "$60,000")

  ## A function taken from
  ## https://stat.ethz.ch/pipermail/r-help/2011-October/291396.html
  ## This simply formats the R squared
r2format <- function(object, digits = 3, output, sub, expression = TRUE, ...) {
  if (inherits(object, "lm")) {
    x <- summary(object)
  } else if (inherits(object, "summary.lm")) {
    x <- object
  } else stop("object is an unmanageable class")
  out <- format(x$r.squared, digits = digits)

  if (!missing(output)) {
    output <- gsub(sub, out, output)
  } else {
    output <- out
  }
  if (expression) {
    output <- parse(text = output)
  }
  return(output)
}


  ## Now for the actual plot, which shows the relationship between estimated and
  png("Figure2.png", width = 6, height = 4, units = "in", res = 960)
   m1 <- lm(mydat$pppEST ~ mydat$pppGDP)    ## Simple OLS model for comparison
   plot(x = 0, y = 0, pch = "",             ## plots nothing, but sets up the plot window
    axes = FALSE,
    ylim = c(0, 30000),
    xlim = c(0, 60000),
    xlab = "World Bank GDP per capita (PPP)",
    ylab = "World Values personal income per capita (PPP)")
   text(x = mydat$pppGDP, y = mydat$pppEST, labels = mydat$abbr, cex = .4)  ## puts the abbreviations
                                                                            ## in as points
   axis(side = 1, at = axpts, labels = axlabs, cex.axis = .5)
   axis(side = 2, at = axpts, labels = axlabs, cex.axis = .5, las = 2)
   abline(m1, lty = "dashed")                                               ## ols line
   text(52000, 22000, r2format(m1, 2, "R^2 == rval", "rval"))
   abline(a = 0, b = 1)                                                     ## 45 degree line
  dev.off()

#####################################################################################################
###### Figure 2
#####################################################################################################

  setwd(paste(replpath, "/ReplicationData/", sep = ""))

  ## The next four sections pull out a single country-wave from another cross-national survey to show
  ## examples of realized income distributions
  cses3 <- read.dta("FRA07csesIncome.dta")              ## Comparative Study of Electoral Systems
                                                        ## France 2007 income and weight data
  cses3$simpleInc <- match(cses3$C2020, c("1. LOWEST HOUSEHOLD INCOME QUINTILE",
                                        "2. SECOND HOUSEHOLD INCOME QUINTILE",
                                        "3. THIRD HOUSEHOLD INCOME QUINTILE",
                                        "4. FOURTH HOUSEHOLD INCOME QUINTILE",
                                        "5. HIGHEST HOUSEHOLD INCOME QUINTILE"))

  fra07 <- subset(cses3, C1004 == "FRA_2007" & !is.na(cses3$simpleInc) &
                                            !is.na(cses3$C1010_2))      ## removing missings because
                                                                        ## weighted.hist() doesn't
                                                                        ## work with missing data


  ISSPsi <- read.dta("ISSPsiIncome.dta")                ## ISSP Social Inequality Module
  conames <- read.csv("isspsicountries.csv")            ## For matching country names

  ISSPsi <- merge(ISSPsi, conames, by = "country")
  ISSPsi$country <- ISSPsi$newcountry
  ISSPsi$cwv <- paste(ISSPsi$country, ISSPsi$year)

  De99 <- subset(ISSPsi, cwv == "Germany 1999" & !is.na(income) &
                                                !is.na(weight))


  ess <- read.dta("IrelandESSIncome6.dta")              ## European Social Survey
                                                        ## Ireland Round 6 income and weight data

  ess$cwv <- paste(ess$cntry, ess$essround)
  ess$simpleInc <- match(ess$inccode, c("J", "R", "C", "M", "F", "S",
                                        "K", "P", "D", "H", "U", "N"))  ## Income codes, turns "NA"
                                                                        ## into NA

  ire6 <- subset(ess, !is.na(ess$simpleInc) & !is.na(ess$dweight))


  Lat03 <- read.dta("Latvia2003.dta")                   ## Cand. Count. Eurobarometer
                                                        ## Latvia, 2003 income and weight data

  Lat03$simpleInc <- match(Lat03$d12, c("1st (lowest) income decile",
                                        "2nd income decile",
                                        "3rd income decile",
                                        "4th income decile",
                                        "5th income decile",
                                        "6th income decile",
                                        "7th income decile",
                                        "8th income decile",
                                        "9th income decile",
                                        "10th (highest) income decile"))
  Lat03 <- subset(Lat03, !is.na(simpleInc) & !is.na(weight1))


  ## Creates four weighted.hist() objects without plotting them
  holder1 <- weighted.hist(ire6$simpleInc,
    w = ire6$dweight,
    freq = FALSE,
    breaks = c(seq(1, 11, by = 1)),
    axisnames = FALSE,
    plot = FALSE)


  holder2 <- weighted.hist(fra07$simpleInc,
    w = fra07$C1010_2,
    freq = FALSE,
    breaks = c(seq(0.5, 5.5, by = 1)),
    axisnames = FALSE,
    plot = FALSE)


  holder3 <- weighted.hist(De99$income,
    w = De99$weight,
    freq = FALSE,
    breaks = c(seq(1, 11, by = 1)),
    axisnames = FALSE,
    plot = FALSE)


  holder4 <- weighted.hist(Lat03$simpleInc,
    w = Lat03$weight,
    freq = FALSE,
    breaks = c(seq(1, 11, by = 1)),
    axisnames = FALSE,
    plot = FALSE)

  ## Now for the plotting
  setwd(replpath)
  png("Figure3.png", width = 540, height = 360)
  layout(matrix(1:4, nrow = 2, ncol = 2)) ## square layout

    barplot(holder1$density,
        axes = FALSE,
        space = 0,
        ylim = c(0, .2),
        col = "gray90",
        cex.lab = 1.5,
        main = "Irish Income - ESS 2012/13",
        xlim = c(0, 10))
    axis(side = 2,
        at = seq(0, .2, by = .1),
        labels = c("0", "10", "20%"),
        las = 2)
    axis(side = 1,
        at = seq(.5, 9.5, by = 1),
        labels = 1:10, cex.axis = .9)
    barplot(holder2$density,
        axes = FALSE,
        space = 0,
        ylim = c(0, .35),
        col = "gray90",
        cex.lab = 1.5,
        main = "French Income - CSES 2007",
        xlim = c(0, 5))
    axis(side = 2,
        at = seq(0, .3, by = .1),
        labels = c("0", "10", "20", "30%"),
        las = 2)
    axis(side = 1,
        at = seq(.5, 4.5, by = 1),
        labels = 1:5, cex.axis = .9)

    barplot(holder3$density,
        axes = FALSE,
        space = 0,
        ylim = c(0, .2),
        col = "gray90",
        cex.lab = 1.5,
        main = "German Income - ISSP 1999",
        xlim = c(0, 10))
    axis(side = 2,
        at = seq(0, .2, by = .1),
        labels = c("0", "10", "20%"),
        las = 2)
    axis(side = 1,
        at = seq(.5, 9.5, by = 1),
        labels = 1:10, cex.axis = .9)

    barplot(holder4$density,
        axes = FALSE,
        space = 0,
        ylim = c(0, .3),
        col = "gray90",
        cex.lab = 1.5,
        main = "Latvian Income - CCEB 2003",
        xlim = c(0, 10))
    axis(side = 2,
        at = seq(0, .3, by = .1),
        labels = c("0", "10", "20", "30%"),
        las = 2)
    axis(side = 1,
        at = seq(.5, 9.5, by = 1),
        labels = 1:10, cex.axis = .9)

  dev.off()


#####################################################################################################
###### Individual-level correlations between income derived from bands and imputed income
#####################################################################################################

  full$originc <- exp(full$loginc)
  cor(x = full$originc, y = full$impinc, use = "pairwise.complete.obs") # 0.918
  by(full, full$simpleInc, function(dat){cor(x = dat$originc, y = dat$impinc,
                                                            use = "pairwise.complete.obs")})
        ## Numbers range from 0.82 to 0.89
  full$cwv <- paste(full$conames, full$wv)
  cwvcors <- unlist(by(full, full$cwv, function(dat){cor(x = dat$originc, y = dat$impinc,
                                                            use = "pairwise.complete.obs")}))
  subset(cwvcors, cwvcors < .9) ## Only Malta 1


#####################################################################################################
###### Appendix B
#####################################################################################################
 nr <- 5        ## number of rows and columns per page in the appendix
 nc <- 3

 fullnomiss <- subset(full, !is.na(simpleInc))

 ## The next few names are too long. This abbreviates them
 fullnomiss$conames <- ifelse(fullnomiss$conames == "Bosnia and Herzegovina",
                    "Bosnia", as.character(fullnomiss$conames))
 fullnomiss$conames <- ifelse(fullnomiss$conames == "Czech Republic",
                    "Czech R.", as.character(fullnomiss$conames))
 fullnomiss$conames <- ifelse(fullnomiss$conames == "Dominican Republic",
                    "Dominican R.", as.character(fullnomiss$conames))
 fullnomiss$conames <- ifelse(fullnomiss$conames == "Northern Ireland",
                    "Nor. Ireland", as.character(fullnomiss$conames))
 fullnomiss$conames <- ifelse(fullnomiss$conames == "SwitzerlandSubinc",
                    "Switz. (Subj.)", as.character(fullnomiss$conames))

 # Fixing the year label in waves that straddled two years
 fullnomiss$wyr <- ifelse(fullnomiss$conames == "Slovakia" & fullnomiss$wv == 2, "1990/91",
                    ifelse(fullnomiss$conames == "Spain" & fullnomiss$wv == 4, "1999/2000",
                        ifelse(fullnomiss$conames == "Poland" & fullnomiss$wv == 3, "1989/90",
                            ifelse(fullnomiss$conames == "Colombia" & fullnomiss$wv == 2, "1997/98",
                                ifelse(fullnomiss$conames == "Czech Republic" & fullnomiss$wv == 2,
                                    "1990/91", as.character(fullnomiss$year))))))


 fullnomiss$CWave <- paste(fullnomiss$conames, fullnomiss$wyr, sep = " ")
 CWVS <- unique(fullnomiss$CWave)           ## Total number of country-waves
 pgs <- ceiling(length(CWVS)/(nr*nc))       ## Number of pages necessary to include all country-waves


 setwd(paste(replpath, "/AppendixB/", sep = ""))
 for (i in 1:pgs){
    thisblock <- ((i - 1)*nr*nc + 1):(i*nr*nc)
    thesesvys <- sort(CWVS)[thisblock]                  ## Selects out the surveys to be plotted on
                                                        ## this page
    home <- subset(fullnomiss, !is.na(match(CWave, thesesvys)))

    pname <- paste("AppBpage", i, ".pdf", sep = "")     ## Name of the pdf file
    pdf(pname)
        pt <- qplot(x = simpleInc, ..density..,         ## Standard ggplot2 histogram
                    data=home, geom="histogram",
                  weight=weight, binwidth = 1,
                  boundary = .5) +
        xlab("") +
        ylab("") + theme_bw() +
        scale_x_continuous(breaks = 1:10)+
        theme(axis.title.y = element_text(size = 15, angle = 90)) +
        theme(strip.text.x = element_text(size = 15)) +
        ylim(0, .5) +
        facet_wrap(~CWave, ncol = nc, nrow = nr)
        print(pt)
    dev.off()
    print(i)                                            ## for monitoring progress

 }

 ## Finished
 setwd(replpath)
 date()
